# SETUP: We will need tidyverse for almost all of these functions
library(tidyverse)

# ==============================================================================
# USECASE: Re-capitalize strings
x <- "R4SS: Introduction to R for Social Scientists"

str_to_lower(x)
str_to_upper(x)
str_to_sentence(x)
str_to_title(x)

# ==============================================================================
# USECASE: Extract subsets of strings
x <- c("Apple", "Banana", "Pear")

str_sub(x, start = 1, end = 3)
str_sub(x, start = -3, end = -1) # negative positions count from the end
str_sub(x, start = 2, end = -2)
str_sub(x, start = 1, end = 5) # can go beyond the end

# ==============================================================================
# USECASE: Remove whitespace from strings
# NOTE: The example needs repeated inner spaces, otherwise str_squish() and
# str_trim() would return the same thing
x <- "   Sometimes   strings have    too much  white space   "
x

str_trim(x) # remove white space at the start and end
str_squish(x) # trim and then collapse inner white space

# ==============================================================================
# USECASE: Remove and replace patterns in strings
x <- "Scientists very often utilize very fancy words, even when they could utilize simpler ones."

str_remove(x, pattern = "very ") # removes first pattern match only
str_remove_all(x, pattern = "very ") # removes all pattern matches
str_replace(x, pattern = "utilize", replacement = "use") # first match only
str_replace_all(x, pattern = "utilize", replacement = "use") # all matches

# NOTE: More complex patterns can be found using regular expressions (regex)

# ==============================================================================
# USECASE: Create a string manipulation pipeline
x_clean <- x |>
  str_remove_all("very ") |>
  str_replace_all("utilize", "use") |>
  print()
::: footer Extra Slides :::
If Else
A locked door behaves conditionally
If you have the key, then open up…
Otherwise, stay closed…
Sometimes we want code to behave conditionally
Filter retains observations conditionally (i.e., if an observation meets a condition, it gets to stay)
Let’s learn to transform variables conditionally
We can use if_else() for simple examples
If Else Live Coding
# SETUP: We will need tidyverse for almost all of these functions
library(tidyverse)

# ==============================================================================
# USECASE: Determining whether someone can vote in the US
age <- 12

age_group <- if_else(
  condition = age >= 18,
  true = "adult",
  false = "child"
) |>
  print()

# ==============================================================================
# TIP: Because argument names are optional, we can shorten this (if we want)
age_group <- if_else(age >= 18, "adult", "child") |>
  print()

# ==============================================================================
# LESSON: This function is particularly useful applied to vectors
# (each element of the vector is checked against the condition separately)
ages <- c(13, 18, 14, 19, 22, 16)

age_groups <- if_else(ages >= 18, "adult", "child") |>
  print()

# ==============================================================================
# USECASE: We can therefore use it during data wrangling
# The na argument tells read_csv() to treat "-999" entries as missing (NA)
cereal <- read_csv("cereal.csv", na = "-999")

# Without the pipe: add a yes/no variable based on rating
cereal2 <- mutate(
  cereal,
  popular = if_else(rating > 50, "yes", "no")
)
cereal2

# With the pipe and named arguments: label cereals by sugar content
cereal3 <- cereal |>
  mutate(
    diabetes = if_else(
      condition = sugars == 0,
      true = "sugar-free",
      false = "contains sugar"
    )
  ) |>
  print()
Case When
An elevator also behaves conditionally
If you press a button, then it goes to that floor
There are usually more than just two buttons
In this analogy (but not in real life), the elevator only responds to the first button pressed
Sometimes we want code to behave this way
case_when() expands upon if_else()
It can have multiple conditions (floor buttons)
The first condition met “wins” (picks the floor)
Case When Live Coding
# SETUP: We will need tidyverse for almost all of these functions
library(tidyverse)

# ==============================================================================
# USECASE: Determine what types of movies your kids can watch
ages <- c(11, 13, 18)

movies_allowed <- case_when(
  ages >= 17 ~ "R",
  ages >= 13 ~ "PG-13",
  ages < 13 ~ "PG"
) |>
  print()

# ==============================================================================
# PITFALL: Don't put the least restrictive condition first
age <- 18

movies_allowed2 <- case_when(
  age < 13 ~ "PG",
  age >= 13 ~ "PG-13",
  age >= 17 ~ "R"
) |>
  print() # age >= 13, so PG-13 wins before checking if age >= 17

# ==============================================================================
# USECASE: Use case_when to re-code variables during data wrangling
starwars

sw <- starwars |>
  mutate(
    species3 = case_when(
      species == "Human" ~ "Human",
      species == "Droid" ~ "Droid",
      species != "Human" & species != "Droid" ~ "Alien"
    )
  ) |>
  select(name, species3) |>
  print()

# ==============================================================================
# TIP: case_when() has a .default argument (requires dplyr 1.1.0 or newer)
# This is where the elevator will drop you off if you hit no buttons
sw <- starwars |>
  mutate(
    species3 = case_when(
      species == "Human" ~ "Human",
      species == "Droid" ~ "Droid",
      .default = "Alien"
    )
  ) |>
  select(name, species3) |>
  print()

# NOTE: If your dplyr is older than 1.1.0, .default is not available yet
# In that case, use TRUE ~ "Alien" as the last condition instead (it works but
# is harder to explain)
# NOTE: .default also catches characters whose species is missing (NA), so they
# become "Alien" here; the explicit != version above leaves them as NA
Wrangle X
Across
We can use across() to repeat an operation across multiple variables in a tibble
This makes our code shorter
It is faster to read and write
It is also less error-prone
So we can repeat a function in order to…
…mutate() multiple variables
…summarize() multiple variables
Across Live Coding
# SETUP: We will need tidyverse and an example dataset
library(tidyverse)
starwars

# ==============================================================================
# USECASE: Applying the same mutation to multiple variables is a pain
sw <- starwars |>
  mutate(
    hair_color = factor(hair_color),
    skin_color = factor(skin_color),
    eye_color = factor(eye_color)
  ) |>
  print() # before

sw <- starwars |>
  mutate(
    across(
      .cols = c(hair_color, skin_color, eye_color),
      .fns = factor
    )
  ) |>
  print() # after

# ==============================================================================
# PITFALL: Don't forget to wrap the .cols part in c()
# NOTE: try() shows the error message but lets the rest of the script keep
# running if the whole file is sourced at once
try(
  sw <- starwars |>
    mutate(
      across(
        .cols = mass, birth_year,
        .fns = \(x) round(x, digits = 1)
      )
    ) |>
    print()
) # error

# ==============================================================================
# LESSON: To pass arguments to the inner function, wrap it in an anonymous
# function; here \(x) round(x, digits = 1) is applied to each selected column
# NOTE: Adding the extra arguments directly inside across() (e.g., .fns = round,
# digits = 1) is deprecated as of dplyr 1.1.0
sw <- starwars |>
  mutate(
    across(
      .cols = c(mass, birth_year),
      .fns = \(x) round(x, digits = 1)
    )
  ) |>
  print()

# ==============================================================================
# USECASE: You can also apply the same summary functions across variables
sw <- starwars |>
  summarize(
    height = mean(height, na.rm = TRUE),
    mass = mean(mass, na.rm = TRUE),
    birth_year = mean(birth_year, na.rm = TRUE)
  ) |>
  print()

sw <- starwars |>
  summarize(
    across(
      .cols = c(height, mass, birth_year),
      .fns = \(x) mean(x, na.rm = TRUE)
    )
  ) |>
  print()
Separate and Unite
Tidy data needs one value per cell
So we may need to separate cells
e.g., What was the model of my first car?
"Nissan Altima 2003" ⬎
"Nissan" "Altima" "2003"
But some tasks require us to unite cells
e.g., What address should I mail to?
123 "Main Street" ⬎
"123 Main Street"
Separate Live Coding
# SETUP: We will need tidyverse and an example dataset
library(tidyverse)

# Create some example data
dat <- tibble(
  id = c(
    "A_001_01", "A_002_01",
    "B_001_01", "B_002_01",
    "C_001_01", "C_002_01"
  ),
  duration = c("01:16", "01:21", "01:49", "00:34", "00:32", "00:54")
) |>
  print()

# ==============================================================================
# USECASE: Separate a column into multiple columns
# (the text in duration is split wherever the sep character appears)
dat2 <- dat |>
  separate(
    col = duration,
    into = c("min", "sec"),
    sep = ":"
  ) |>
  print()

# ==============================================================================
# USECASE: This also works with more than two "into" columns
dat2 <- dat |>
  separate(
    col = id,
    into = c("group", "subject", "time"),
    sep = "_"
  ) |>
  print()

# ==============================================================================
# TIP: To automatically convert strings into numbers, use convert
dat2 <- dat |>
  separate(
    col = id,
    into = c("group", "subject", "time"),
    sep = "_",
    convert = TRUE
  ) |>
  print()

# ==============================================================================
# PITFALL: Don't forget to close the "into" vector's parentheses
# (below, sep = ":" has ended up inside c() instead of inside separate())
dat2 <- dat |>
  separate(col = duration, into = c("min", "sec", sep = ":")) #error
Unite Live Coding
# SETUP: We will need tidyverse and an example dataset
library(tidyverse)

# Re-create the example data so this script can be run on its own
dat <- tibble(
  id = c("A_001_01", "A_002_01", "B_001_01", "B_002_01", "C_001_01", "C_002_01"),
  duration = c("01:16", "01:21", "01:49", "00:34", "00:32", "00:54")
)

# Split both columns into their pieces so there is something to unite
# NOTE: The parts of id are separated by underscores, so sep must be "_"
dat2 <- dat |>
  separate(col = id, into = c("group", "subject", "time"), sep = "_") |>
  separate(col = duration, into = c("min", "sec"), sep = ":") |>
  print()

# ==============================================================================
# USECASE: Unite multiple columns into one string
dat3 <- dat2 |>
  unite(col = "newid", group, subject, time, sep = "-") |>
  unite(col = "duration", min, sec, sep = ":") |>
  print()

# ==============================================================================
# LESSON: Retain the columns being united with remove = FALSE
# (group:time selects every column from group through time)
dat3 <- dat2 |>
  unite(col = "newid", group:time, sep = "", remove = FALSE) |>
  print()